Dependencies

library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.8
## ✔ tidyr   0.8.2     ✔ stringr 1.3.1
## ✔ readr   1.3.0     ✔ forcats 0.3.0
## ── Conflicts ─────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(dplyr)
library(klaR)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
## 
##     select
## The following object is masked from 'package:dplyr':
## 
##     select
# Load the Black Friday purchase records from the project's data directory.
data.tb <- readr::read_csv(file = "./data/BlackFriday.csv")
## Parsed with column specification:
## cols(
##   User_ID = col_double(),
##   Product_ID = col_character(),
##   Gender = col_character(),
##   Age = col_character(),
##   Occupation = col_double(),
##   City_Category = col_character(),
##   Stay_In_Current_City_Years = col_character(),
##   Marital_Status = col_double(),
##   Product_Category_1 = col_double(),
##   Product_Category_2 = col_double(),
##   Product_Category_3 = col_double(),
##   Purchase = col_double()
## )
# Preview the first 25 rows of the loaded data.
head(data.tb, n = 25)
## # A tibble: 25 x 12
##    User_ID Product_ID Gender Age   Occupation City_Category
##      <dbl> <chr>      <chr>  <chr>      <dbl> <chr>        
##  1 1000001 P00069042  F      0-17          10 A            
##  2 1000001 P00248942  F      0-17          10 A            
##  3 1000001 P00087842  F      0-17          10 A            
##  4 1000001 P00085442  F      0-17          10 A            
##  5 1000002 P00285442  M      55+           16 C            
##  6 1000003 P00193542  M      26-35         15 A            
##  7 1000004 P00184942  M      46-50          7 B            
##  8 1000004 P00346142  M      46-50          7 B            
##  9 1000004 P0097242   M      46-50          7 B            
## 10 1000005 P00274942  M      26-35         20 A            
## # ... with 15 more rows, and 6 more variables:
## #   Stay_In_Current_City_Years <chr>, Marital_Status <dbl>,
## #   Product_Category_1 <dbl>, Product_Category_2 <dbl>,
## #   Product_Category_3 <dbl>, Purchase <dbl>
# Work on the first 2000 rows only.
test.tb <- head(data.tb, n = 2000)

Find the unique values of Age

# List the distinct age-group labels present in the sample.
unique(test.tb[["Age"]])
## [1] "0-17"  "55+"   "26-35" "46-50" "51-55" "36-45" "18-25"

Function that maps each Age group label to a numeric code (1 = youngest group, 7 = oldest group)

#' Map age-group labels to ordinal numeric codes.
#'
#' Vectorised lookup, so it can be applied to a whole column at once as well
#' as to a single label.
#'
#' @param age Character vector (or factor) of age-group labels: "0-17",
#'   "18-25", "26-35", "36-45", "46-50", "51-55" or "55+".
#' @return Numeric vector of the same length as `age`, with codes 1 (youngest
#'   group) to 7 (oldest group). Labels that are missing or not in the table
#'   map to `NA` instead of silently yielding `NULL`.
ageFilter <- function(age) {
  age_codes <- c(
    "0-17" = 1, "18-25" = 2, "26-35" = 3, "36-45" = 4,
    "46-50" = 5, "51-55" = 6, "55+" = 7
  )
  # match() returns NA for unknown labels, which indexes to NA below;
  # unname() drops the label names so a single lookup is a bare number.
  unname(age_codes[match(age, names(age_codes))])
}
# Quick check: the "51-55" group should map to code 6.
ageFilter("51-55")
## [1] 6

Pipe the Age vector through ageFilter() and append the result to the tibble as `age`

##data.tb$age <- ageFilter(data.tb$Age)
# Encode every Age label with ageFilter() and store the result in a new `age`
# column.
#
# vapply() replaces the former `for (i in 1:length(...))` loop: `1:length(x)`
# iterates over c(1, 0) when the table is empty and then fails, whereas
# vapply() simply returns an empty vector. It also checks that each label
# yields exactly one number.
#
# The codes are stored as character strings ("1" ... "7"), which is what the
# element-wise assignment into a copy of the character `Age` column produced
# before, so code that consumes `age` sees the same values. Convert with
# as.numeric() where a number is required.
test.tb$age <- as.character(
  vapply(test.tb$Age, ageFilter, numeric(1), USE.NAMES = FALSE)
)

Regression: Age vs Purchase Value

# Regress purchase amount on age group without an intercept. `age` holds
# character codes, so lm() treats it as a categorical predictor and fits one
# coefficient per age group.
#
# The columns are referenced by bare name so that they are resolved through
# `data`. Writing `test.tb$Purchase ~ test.tb$age` bypasses `data`, breaks
# predict() with `newdata`, and leaks the table name into coefficient names.
fit <- lm(Purchase ~ age + 0, data = test.tb)
# Select the coefficients by name rather than by list position.
fit["coefficients"]
## $coefficients
##     age1     age2     age3     age4     age5     age6     age7 
## 9968.213 9256.855 9785.878 9799.988 9218.589 8482.071 8192.040

Correlation: Age vs Purchase Value

# Pearson correlation between the numeric age code and the purchase amount;
# both inputs are coerced to numeric before the call.
cor(as.numeric(test.tb$age), as.numeric(test.tb$Purchase))
## [1] -0.04984716
# Keep only the rows that have no missing value in any column.
# NOTE(review): na.omit() inspects every column, not just Age and Purchase;
# confirm that dropping rows for NAs in unrelated columns is intended.
test_na.tb <- na.omit(test.tb)

# Bar chart of purchase amount (y) against age-group label (x).
plot <- plot_ly(
  type = "bar",
  name = "Age vs. Purchase Amount",
  x = test_na.tb$Age,
  y = test_na.tb$Purchase
)

# Display the chart.
plot